Principal Component Analysis as a Special Case of Reduced Rank Regression

# NOTE(review): rm(list = ls()) wipes the user's workspace and is generally
# discouraged in scripts; kept here to preserve this script's original
# stand-alone behavior.
rm(list = ls())
library(tidyverse)
library(dplyr)         # already attached by tidyverse; kept for explicitness
library(rrr)
library(ggplot2)       # already attached by tidyverse; kept for explicitness
library(scatterplot3d)
library(plotly)

data("iris")

# Load iris as a tibble. as.tibble() is deprecated; as_tibble() is the
# current tibble API.
iris_data <- iris %>%
  as_tibble()
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Arrange the data as variables x observations (4 x 150): rows = features.
X <- iris_data[, 1:4]
X <- t(X)
# Per-variable means (length 4); added back later to un-center reconstructions.
mux <- apply(X, 1, mean)
# Center each variable. apply() over rows returns its result transposed, so
# X_c is observations x variables (150 x 4) with zero column means.
X_c <- apply(X, 1, function(x) x - mean(x))
# BUG FIX: n must be the number of OBSERVATIONS (150). The original used
# nrow(X), which is 4 (the number of variables) because X was transposed
# above; n is later used as the sample size in (d^2) / (n - 1) when turning
# singular values into covariance eigenvalues.
n <- nrow(X_c)

PCA as a special case of RRR from first principles, using the eigendecomposition (spectral decomposition)

# Eigendecomposition of the sample covariance matrix of the centered data.
sigmaxx <- cov(X_c)
decomp  <- eigen(sigmaxx)

# Eigenvalues are the variances along each principal direction; their
# normalized values give the proportion of variance explained.
eigenvalues        <- decomp$values
eigenvectors       <- decomp$vectors
total_variance     <- sum(eigenvalues)
variance_explained <- eigenvalues / total_variance


# We proceed to perform cross validation by selecting different ranks.

# In the PCA-as-RRR formulation the coefficient matrix factors as C = A B
# with A = V (eigenvectors) and B = t(V).
V <- eigenvectors
A <- V
B <- t(V)

# Rank 1: C1 = v1 %*% t(v1) projects onto the first principal direction.
A1 <- V[, 1, drop = FALSE]   # 4 x 1
B1 <- B[1, , drop = FALSE]   # 1 x 4
C1 <- A1 %*% B1
# Reconstruct in variables-x-observations space, then transpose back to
# observations x variables; mux is recycled down columns, restoring each
# variable's mean.
x_1_reconstructed <- t(mux + C1 %*% t(X_c))
lse_1 <- sum((t(X) - x_1_reconstructed)^2)

# Rank 2: keep the two leading principal directions.
A2 <- V[, 1:2, drop = FALSE]   # 4 x 2
B2 <- B[1:2, , drop = FALSE]   # 2 x 4
C2 <- A2 %*% B2
# tcrossprod(C2, X_c) == C2 %*% t(X_c)
x_2_reconstructed <- t(mux + tcrossprod(C2, X_c))
lse_2 <- sum((t(X) - x_2_reconstructed)^2)

# Rank 3: keep the three leading principal directions.
A3 <- V[, 1:3, drop = FALSE]   # 4 x 3
B3 <- B[1:3, , drop = FALSE]   # 3 x 4
C3 <- A3 %*% B3
x_3_reconstructed <- t(mux + tcrossprod(C3, X_c))
lse_3 <- sum((t(X) - x_3_reconstructed)^2)

# Full Rank: C = V %*% t(V) equals the identity (V is orthonormal), so the
# reconstruction should be exact and lse should be ~0 up to rounding.
C <- A %*% B

col_names   <- colnames(X_c)
colnames(C) <- col_names
# Scores: centered data projected through C (here a no-op projection).
pca_scores      <- t(tcrossprod(C, X_c))
x_reconstructed <- t(mux + tcrossprod(C, X_c))
x_og            <- t(X)
lse <- sum((t(X) - x_reconstructed)^2)
colnames(x_reconstructed) <- col_names

PCA as a special case of RRR from first principles, using the SVD approach

# SVD of the centered data matrix. The right singular vectors (v) are the
# covariance eigenvectors, and lambda_i = d_i^2 / (n_obs - 1).
decomp2 <- svd(X_c)
# BUG FIX: the sample size is the number of rows of X_c (150 observations).
# The script's earlier n was computed from nrow(X) = 4 (the number of
# variables), which scaled these eigenvalues incorrectly; use nrow(X_c)
# directly so this section is self-consistent.
eigenvalues  <- (decomp2$d^2) / (nrow(X_c) - 1)
eigenvectors <- decomp2$v
total_variance_svd     <- sum(eigenvalues)
variance_explained_svd <- eigenvalues / total_variance_svd

# We proceed to perform cross validation by selecting different ranks.

# Same RRR factorization C = A B, now with V from the SVD.
V <- eigenvectors
A <- V
B <- t(V)

# Rank 1: projection onto the first right singular vector.
A1 <- V[, 1, drop = FALSE]   # 4 x 1
B1 <- B[1, , drop = FALSE]   # 1 x 4
C1_svd <- A1 %*% B1
x_1_reconstructed <- t(mux + tcrossprod(C1_svd, X_c))
lse_1_svd <- sum((t(X) - x_1_reconstructed)^2)


# Rank 2: keep the two leading singular directions.
A2 <- V[, 1:2, drop = FALSE]   # 4 x 2
B2 <- B[1:2, , drop = FALSE]   # 2 x 4
C2_svd <- A2 %*% B2
x_2_reconstructed <- t(mux + tcrossprod(C2_svd, X_c))
lse_2_svd <- sum((t(X) - x_2_reconstructed)^2)

# Rank 3: keep the three leading singular directions.
A3 <- V[, 1:3, drop = FALSE]   # 4 x 3
B3 <- B[1:3, , drop = FALSE]   # 3 x 4
C3_svd <- A3 %*% B3
x_3_reconstructed <- t(mux + tcrossprod(C3_svd, X_c))
lse_3_svd <- sum((t(X) - x_3_reconstructed)^2)

# Full Rank: C_svd = V %*% t(V) is the identity, so reconstruction is exact
# and lse_svd should be ~0 up to rounding.
C_svd <- A %*% B

col_names <- colnames(X_c)
# BUG FIX: the original wrote colnames(C) here, re-labelling the matrix from
# the eigendecomposition section. This section builds C_svd, so that is the
# matrix to label.
colnames(C_svd) <- col_names
x_reconstructed <- t(mux + tcrossprod(C_svd, X_c))
lse_svd <- sum((t(X) - x_reconstructed)^2)
colnames(x_reconstructed) <- col_names

PCA as special case of RRR using R package rrr

# Fit PCA via reduced-rank regression using the rrr package: regressing the
# centered data on itself with type = "pca" recovers the principal
# components; rank = 2 keeps the first two. k = 0 means no ridge penalty.
rrr_model = rrr(X_c, X_c, k = 0, type="pca", rank = 2)

summary(rrr_model)
##                 Length Class  Mode   
## means           4      -none- numeric
## C               4      tbl_df list   
## PC              2      tbl_df list   
## goodness_of_fit 2      -none- numeric
rrr_model$PC
## # A tibble: 4 × 2
##       PC1     PC2
##     <dbl>   <dbl>
## 1  0.361   0.657 
## 2 -0.0845  0.730 
## 3  0.857  -0.173 
## 4  0.358  -0.0755
# Sanity check: the package's coefficient matrix matches the hand-built
# rank-2 C2 from the eigendecomposition section.
# NOTE(review): comparing floating-point matrices with `==` is fragile; it
# happens to hold exactly here, but all.equal() (or dplyr::near()) is the
# robust way to make this comparison.
rrr_model$C == C2
##        V1   V2   V3   V4
## [1,] TRUE TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE TRUE
## [3,] TRUE TRUE TRUE TRUE
## [4,] TRUE TRUE TRUE TRUE

Classic PCA

# Classic PCA via prcomp(). The data are already centered, so center = FALSE;
# scale. = FALSE keeps the analysis on the covariance (not correlation) scale.
classic_pca <- prcomp(X_c, retx = TRUE, center = FALSE, scale. = FALSE)

# Scores: the centered data projected onto the eigenvectors.
# as.tibble() is deprecated; use as_tibble().
pca_scores <- classic_pca$x %>%
  as_tibble()
pca_classic_variance_explained <- (classic_pca$sdev^2) / sum(classic_pca$sdev^2)

# data.frame() converts `Var Explained` to the syntactic name Var.Explained,
# which the scree plot below relies on. seq_along() is the safe form of
# 1:length().
pca_classic_variance_explained_df <- data.frame(
  PC = seq_along(pca_classic_variance_explained),
  `Var Explained` = pca_classic_variance_explained
) %>%
  as_tibble()

summary(classic_pca)
## Importance of components:
##                           PC1     PC2    PC3     PC4
## Standard deviation     2.0563 0.49262 0.2797 0.15439
## Proportion of Variance 0.9246 0.05307 0.0171 0.00521
## Cumulative Proportion  0.9246 0.97769 0.9948 1.00000
# Scree plot: proportion of variance explained by each principal component.
ggplot(pca_classic_variance_explained_df, aes(x = PC, y = Var.Explained)) +
  geom_point() +
  geom_line() +
  labs(
    x = "Principal Component",
    y = "Proportion of Variance Explained",
    title = "Scree Plot"
  ) +
  theme_minimal()

# Scatter of the first two principal-component scores.
ggplot(pca_scores, aes(x = PC1, y = PC2)) +
  geom_point()

# Static 3D view of the first three scores.
scatterplot3d(
  x = pca_scores$PC1,
  y = pca_scores$PC2,
  z = pca_scores$PC3,
  main = "Principal Components",
  xlab = "PC1",
  ylab = "PC2",
  zlab = "PC3",
  pch = 19,
  color = "blue"
)

# Interactive 3D view of the same scores via plotly.
fig <- plot_ly(
  x = ~pca_scores$PC1,
  y = ~pca_scores$PC2,
  z = ~pca_scores$PC3,
  type = 'scatter3d',
  mode = 'markers',
  marker = list(size = 5, color = 'blue')
)

# Customize the layout
fig <- fig %>%
  layout(
    title = '3D Scatter Plot',
    scene = list(
      xaxis = list(title = 'PC1'),
      yaxis = list(title = 'PC2'),
      zaxis = list(title = 'PC3')
    )
  )

# Render the plot
fig